{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-21T12:45:16.551569Z",
     "start_time": "2020-04-21T12:45:15.418840Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score\n",
    "from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold\n",
    "from tqdm import tqdm\n",
    "import lightgbm as lgb\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import math\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "import gc\n",
    "from datetime import datetime\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "from itertools import combinations\n",
    "\n",
    "warnings.filterwarnings('ignore')\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_rows', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-21T12:45:16.555279Z",
     "start_time": "2020-04-21T12:45:16.553030Z"
    }
   },
   "outputs": [],
   "source": [
    "seed = 2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-21T12:45:17.569010Z",
     "start_time": "2020-04-21T12:45:16.556503Z"
    }
   },
   "outputs": [],
   "source": [
    "df_train = pd.read_csv('./raw_data/used_car_train_20200313.csv', sep=' ')\n",
    "df_test = pd.read_csv('./raw_data/used_car_testA_20200313.csv', sep=' ')\n",
    "df_sub = pd.read_csv('./raw_data/used_car_sample_submit.csv', sep=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-21T12:45:17.608670Z",
     "start_time": "2020-04-21T12:45:17.570367Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature = pd.concat([df_train, df_test], sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-21T12:46:35.205670Z",
     "start_time": "2020-04-21T12:46:35.198364Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['regionCode'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:39:56.197801Z",
     "start_time": "2020-04-05T10:39:56.173093Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:00.687391Z",
     "start_time": "2020-04-05T10:39:56.199573Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from scipy import stats\n",
    "\n",
    "cols = ['bodyType', 'fuelType', 'gearbox']\n",
    "df_feature['gp'] = df_feature['brand'].astype(\n",
    "    'str') + df_feature['model'].astype('str')\n",
    "gp_col = 'gp'\n",
    "\n",
    "df_na = df_feature[cols].isna()\n",
    "# 根据分组计算众数\n",
    "df_mode = df_feature.groupby(gp_col)[cols].agg(\n",
    "    lambda x: stats.mode(x)[0][0])\n",
    "\n",
    "for col in cols:\n",
    "    na_series = df_na[col]\n",
    "    names = list(df_feature.loc[na_series, gp_col])\n",
    "\n",
    "    t = df_mode.loc[names, col]\n",
    "    t.index = df_feature.loc[na_series, col].index\n",
    "\n",
    "    df_feature.loc[na_series, col] = t\n",
    "\n",
    "del df_feature['gp']\n",
    "df_feature[cols].isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:00.705685Z",
     "start_time": "2020-04-05T10:40:00.688816Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['notRepairedDamage'] = df_feature['notRepairedDamage'].replace(\n",
    "    '-', 2)\n",
    "df_feature['notRepairedDamage'] = df_feature['notRepairedDamage'].astype(\n",
    "    'float')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:00.725928Z",
     "start_time": "2020-04-05T10:40:00.706719Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:02.262817Z",
     "start_time": "2020-04-05T10:40:02.247133Z"
    }
   },
   "outputs": [],
   "source": [
    "del df_feature['seller']\n",
    "del df_feature['offerType']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:00.731580Z",
     "start_time": "2020-04-05T10:40:00.726939Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['price'] = np.log1p(df_feature['price'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# feature engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:02.242536Z",
     "start_time": "2020-04-05T10:40:00.732750Z"
    },
    "hide_input": false
   },
   "outputs": [],
   "source": [
    "v_cols = ['v_'+str(i) for i in range(15)]\n",
    "\n",
    "df_feature['v_mean'] = df_feature[v_cols].mean(axis=1)\n",
    "df_feature['v_max'] = df_feature[v_cols].max(axis=1)\n",
    "df_feature['v_min'] = df_feature[v_cols].min(axis=1)\n",
    "df_feature['v_std'] = df_feature[v_cols].std(axis=1)\n",
    "\n",
    "for col in ['v_mean', 'v_max', 'v_min', 'v_std']:\n",
    "    df_feature[f'name_{col}_mean'] = df_feature.groupby('name')[\n",
    "        col].transform('mean')\n",
    "    df_feature[f'name_{col}_std'] = df_feature.groupby('name')[\n",
    "        col].transform('std')\n",
    "    df_feature[f'name_{col}_max'] = df_feature.groupby('name')[\n",
    "        col].transform('max')\n",
    "    df_feature[f'name_{col}_min'] = df_feature.groupby('name')[\n",
    "        col].transform('min')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:02.246058Z",
     "start_time": "2020-04-05T10:40:02.243926Z"
    },
    "hide_input": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# # 时序特征\n",
    "# df_sort = df_feature[['brand', 'model', 'creatDate', 'price']]\n",
    "# df_sort = df_sort.groupby(['brand', 'model', 'creatDate'])[\n",
    "#     'price'].mean().reset_index()\n",
    "# df_sort.rename(columns={'price': 'brand_model_day_price_mean'}, inplace=True)\n",
    "# df_sort = df_sort.sort_values(['brand', 'model', 'creatDate'])\n",
    "# df_sort['brand_model_day_shift1_price_mean'] = df_sort.groupby(\n",
    "#     ['brand', 'model'])['brand_model_day_price_mean'].shift(1)\n",
    "# del df_sort['brand_model_day_price_mean']\n",
    "# # # df_sort['brand_model_price_rolling3_mean'] = df_sort.groupby(\n",
    "# # #     ['brand', 'model'])['price'].shift().rolling(window=3, min_periods=3).mean()\n",
    "# # df_sort['brand_model_price_shift1'] = df_sort.groupby(\n",
    "# #     ['brand', 'model'])['price'].shift()\n",
    "# # df_sort.head()\n",
    "# # print(df_feature.shape)\n",
    "# df_feature = df_feature.merge(df_sort, how='left')\n",
    "# # print(df_feature.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:02.404555Z",
     "start_time": "2020-04-05T10:40:02.264126Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['name_count'] = df_feature.groupby(\n",
    "    ['name'])['SaleID'].transform('count')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:03.043697Z",
     "start_time": "2020-04-05T10:40:02.405934Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def date_parse(x):\n",
    "    year = int(str(x)[:4])\n",
    "    month = int(str(x)[4:6])\n",
    "    day = int(str(x)[6:8])\n",
    "\n",
    "    if month < 1:\n",
    "        month = 1\n",
    "\n",
    "    date = datetime(year, month, day)\n",
    "    return date\n",
    "\n",
    "\n",
    "df_feature['regDate'] = df_feature['regDate'].apply(date_parse)\n",
    "df_feature['creatDate'] = df_feature['creatDate'].apply(date_parse)\n",
    "df_feature['regDate_year'] = df_feature['regDate'].dt.year\n",
    "df_feature['creatDate_year'] = df_feature['creatDate'].dt.year\n",
    "df_feature['creatDate_month'] = df_feature['creatDate'].dt.month"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:03.055607Z",
     "start_time": "2020-04-05T10:40:03.044946Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['car_age_day'] = (\n",
    "    df_feature['creatDate'] - df_feature['regDate']).dt.days\n",
    "df_feature['car_age_year'] = round(df_feature['car_age_day'] / 365, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:40:03.109643Z",
     "start_time": "2020-04-05T10:40:03.060513Z"
    }
   },
   "outputs": [],
   "source": [
    "# 简单统计\n",
    "def stat(df, df_merge, group_by, agg):\n",
    "    group = df.groupby(group_by).agg(agg)\n",
    "\n",
    "    columns = []\n",
    "    for on, methods in agg.items():\n",
    "        for method in methods:\n",
    "            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))\n",
    "    group.columns = columns\n",
    "    group.reset_index(inplace=True)\n",
    "    df_merge = df_merge.merge(group, on=group_by, how='left')\n",
    "\n",
    "    del (group)\n",
    "    gc.collect()\n",
    "\n",
    "    return df_merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:41:03.337430Z",
     "start_time": "2020-04-05T10:40:03.112423Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "l = ['name', 'model', 'brand', 'bodyType']\n",
    "for f1 in tqdm(l):\n",
    "    for f2 in v_cols:\n",
    "        df_feature = stat(df_feature, df_feature, [f1], {\n",
    "            f2: ['mean', 'max', 'min', 'std']})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:41:03.342276Z",
     "start_time": "2020-04-05T10:41:03.338810Z"
    },
    "hide_input": false
   },
   "outputs": [],
   "source": [
    "def statis_feat(df_know, df_unknow):\n",
    "    l = ['name', 'model', 'brand', 'bodyType']\n",
    "\n",
    "    combs = list(combinations(l, 2))\n",
    "    for t1, t2 in combs:\n",
    "        df_unknow = stat(df_know, df_unknow, [t1, t2], {\n",
    "                         'price': ['mean', 'max', 'min', 'std', 'median']})\n",
    "\n",
    "    for f in tqdm(l):\n",
    "        df_unknow = stat(df_know, df_unknow, [f], {\n",
    "                         'price': ['mean', 'max', 'min', 'std']})\n",
    "\n",
    "    return df_unknow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:41:15.812467Z",
     "start_time": "2020-04-05T10:41:03.343363Z"
    }
   },
   "outputs": [],
   "source": [
    "# 5折交叉\n",
    "df_train = df_feature[~df_feature['price'].isnull()]\n",
    "df_train = df_train.reset_index(drop=True)\n",
    "df_test = df_feature[df_feature['price'].isnull()]\n",
    "\n",
    "df_stas_feat = None\n",
    "kf = KFold(n_splits=5, random_state=seed, shuffle=True)\n",
    "for train_index, val_index in kf.split(df_train):\n",
    "    df_fold_train = df_train.iloc[train_index]\n",
    "    df_fold_val = df_train.iloc[val_index]\n",
    "\n",
    "    df_fold_val = statis_feat(df_fold_train, df_fold_val)\n",
    "    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)\n",
    "\n",
    "    del(df_fold_train)\n",
    "    del(df_fold_val)\n",
    "    gc.collect()\n",
    "\n",
    "df_test = statis_feat(df_train, df_test)\n",
    "df_feature = pd.concat([df_stas_feat, df_test], axis=0)\n",
    "\n",
    "del(df_stas_feat)\n",
    "del(df_train)\n",
    "del(df_test)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:41:15.828826Z",
     "start_time": "2020-04-05T10:41:15.813761Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['v_0_add_v_4'] = df_feature['v_0'] + df_feature['v_4']\n",
    "df_feature['v_0_add_v_8'] = df_feature['v_0'] + df_feature['v_8']\n",
    "df_feature['v_1_add_v_3'] = df_feature['v_1'] + df_feature['v_3']\n",
    "df_feature['v_1_add_v_4'] = df_feature['v_1'] + df_feature['v_4']\n",
    "df_feature['v_1_add_v_5'] = df_feature['v_1'] + df_feature['v_5']\n",
    "df_feature['v_1_add_v_12'] = df_feature['v_1'] + df_feature['v_12']\n",
    "df_feature['v_2_add_v_3'] = df_feature['v_2'] + df_feature['v_3']\n",
    "df_feature['v_4_add_v_11'] = df_feature['v_4'] + df_feature['v_11']\n",
    "df_feature['v_4_add_v_12'] = df_feature['v_4'] + df_feature['v_12']\n",
    "df_feature['v_0_add_v_12_add_v_14'] = df_feature['v_0'] + \\\n",
    "    df_feature['v_12'] + df_feature['v_14']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:41:15.837094Z",
     "start_time": "2020-04-05T10:41:15.830231Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['v_4_add_v_9_minu_v_13'] = df_feature['v_4'] + \\\n",
    "    df_feature['v_9'] - df_feature['v_13']\n",
    "df_feature['v_2_add_v_4_minu_v_11'] = df_feature['v_2'] + \\\n",
    "    df_feature['v_4'] - df_feature['v_11']\n",
    "df_feature['v_2_add_v_3_minu_v_11'] = df_feature['v_2'] + \\\n",
    "    df_feature['v_3'] - df_feature['v_11']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:41:16.102520Z",
     "start_time": "2020-04-05T10:41:15.907258Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "df_feature.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:41:16.106612Z",
     "start_time": "2020-04-05T10:41:16.103566Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-05T10:41:19.566375Z",
     "start_time": "2020-04-05T10:41:16.107615Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.to_pickle('feature.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:dm] *",
   "language": "python",
   "name": "conda-env-dm-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
